This notebook is a simple training pipeline in TensorFlow for the Cassava Leaf Competition where we are given 21,397 labeled images of cassava leaves classified as 5 different groups (4 diseases and a healthy group) and asked to predict on unseen images of cassava leaves. As with most image classification problems, we can use and experiment with many different forms of augmentation and we can explore transfer learning.

%%writefile conditional_cell_extension.py
def run_if(line, cell=None):
    '''Execute the cell body only if `line` evaluates to a truthy value.

    line: Python expression written after the magic name (e.g. ``True``).
    cell: source of the cell body; None when invoked as a line magic.

    Returns None in every case.
    '''
    # NOTE(review): eval() runs whatever expression is typed after the magic;
    # acceptable for an interactive notebook, never feed it untrusted text.
    if not eval(line):
        return
    if cell is None:
        # Line-magic form (`%run_if cond`) has no body; nothing to execute.
        return
    get_ipython().ex(cell)

def load_ipython_extension(shell):
    '''Extension entry point: install `run_if` as a magic on `shell`.'''
    # 'line_cell' makes the function available as both a line and a cell magic.
    magic_kind = 'line_cell'
    shell.register_magic_function(run_if, magic_kind)

def unload_ipython_extension(shell):
    '''Remove the run_if magic from `shell` when the extension unloads.

    The magic is registered with kind 'line_cell', so it is removed from both
    the line and the cell registries. Missing entries are ignored so that
    unloading twice does not raise.
    '''
    registry = shell.magics_manager.magics
    for kind in ('line', 'cell'):
        registry.get(kind, {}).pop('run_if', None)
Writing conditional_cell_extension.py
%reload_ext conditional_cell_extension

Note: I am using Dimitre’s TFRecords that can be found here. He also has 128x128, 256x256, and 384x384 sized images that I added for experimental purposes. Please give his datasets an upvote (and his work in general, it is excellent).
import numpy as np
import pandas as pd
import seaborn as sns
import albumentations as A
import matplotlib.pyplot as plt
import os, gc, cv2, random, warnings, math, sys, json, pprint, pdb

import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')
print(f"Using TensorFlow v{tf.__version__}")
Using TensorFlow v2.4.0
#@title Accelerator type { run: "auto", display-mode: "form" }
DEVICE = 'GPU' #@param ["None", "'GPU'", "'TPU'"] {type:"raw", allow-input: true}

# When a TPU is requested, try to resolve and initialise it. Any failure
# downgrades DEVICE to "GPU" so that the default strategy below is always
# created and `strategy` is never left undefined.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # The original `except _:` referenced an undefined name and would
            # itself fail; catch broadly here because this is a top-level
            # fallback, and report what went wrong.
            print("failed to initialize TPU")
            print(f"reason: {err!r}")
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTOTUNE = tf.data.experimental.AUTOTUNE
# Number of replicas the chosen strategy keeps in sync.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')
Using default strategy for CPU and single GPU
Num GPUs Available:  1
REPLICAS: 1
def seed_everything(seed=0):
    """Seed Python, NumPy and TensorFlow RNGs and set determinism env vars.

    seed: integer used for every generator (default 0).
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

# The shell's string form mentions 'google.colab' on Colab; every other
# environment is treated as a Kaggle kernel.
GOOGLE = 'google.colab' in str(get_ipython())
KAGGLE = not GOOGLE

platform_name = "Google Colab" if GOOGLE else "Kaggle Kernel"
print(f"Running on {platform_name}!")
Running on Google Colab!

Tip: Setting a seed helps reproduce results. Setting the DEBUG parameter will run the pipeline on a small (10%) subset of the data to quickly validate the architecture.
#@title ML Lifecycle { run: "auto", display-mode:"form" }
# Seed handed to seed_everything() and to the train_test_split calls.
SEED = 16
# DEBUG: keep only a 10% stratified sample of the dataframe.
DEBUG = False #@param {type:"boolean"}
# TRAIN: load imagenet weights, adapt the normalization layer and fit the model.
TRAIN = True #@param {type:"boolean"}
# INFERENCE: predict on the validation set and draw the confusion matrix.
INFERENCE = True #@param {type:"boolean"}

seed_everything(SEED)
%%run_if {GOOGLE}
# Mount Google Drive so the dataset stored under /content/gdrive is reachable.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
Mounted at /content/gdrive
project_name = 'cassava-leaf-disease-classification'
# On Colab the data lives on the mounted Drive; otherwise paths start at '/'.
root_path  = '/content/gdrive/MyDrive/' if GOOGLE else '/'
# input_path always ends with a slash; later code appends file names directly.
input_path = f'{root_path}kaggle/input/{project_name}/'
# Outputs go to a "working" folder inside input_path on Colab, or to
# /kaggle/working/ otherwise.
working_path = f'{input_path}working/' if GOOGLE else '/kaggle/working/'
os.makedirs(working_path, exist_ok=True)
# Relative file names used later (weights, cache, submission) resolve here.
os.chdir(working_path)
os.listdir(input_path)
['label_num_to_disease_map.json',
 'sample_submission.csv',
 'train.csv',
 'cassava-leaf-disease-classification.zip',
 'test_images',
 'test_tfrecords',
 'train_images',
 'train_tfrecords',
 'dump.tfcache.data-00000-of-00001',
 'dump.tfcache.index',
 'working']

Hyperparameters

# Backbone name and the integer side length of its square input.
# NOTE(review): BASE_MODEL is only printed here; the model built further down
# always instantiates EfficientNetB3 - confirm this is intended.
BASE_MODEL, IMG_SIZE = ('efficientnet_b3', 300) #@param ["('efficientnet_b3', 300)", "('efficientnet_b4', 380)", "('efficientnet_b2', 260)"] {type:"raw", allow-input: true}
BATCH_SIZE = 32 #@param {type:"integer"}
# IMG_SIZE is rebound from the integer side length to a 3-tuple
# (side, side, 3); every later use sees the tuple.
IMG_SIZE = (IMG_SIZE, IMG_SIZE, 3) #@param ["(IMG_SIZE, IMG_SIZE, 3)", "(512,512, 3)"] {type:"raw"}
print("Using {} with input size {}".format(BASE_MODEL, IMG_SIZE))
Using efficientnet_b3 with input size (300, 300, 3)

Data

Exploring data

# Load the training table (columns image_id and label) and preview the first rows.
df = pd.read_csv(f'{input_path}train.csv')
df.head()
image_id label
0 1000015157.jpg 0
1 1000201771.jpg 3
2 100042118.jpg 1
3 1000723321.jpg 1
4 1000812911.jpg 3

Check how many images are available in the training dataset, and also check that every item in the training set is unique.

# Report the number of rows, then confirm that no image_id appears twice.
n_train = len(df)
print(f"There are {n_train} train images")
df.image_id.is_unique
There are 21397 train images
True
# Horizontal bar chart of each label's share of the training set, in percent.
(df.label.value_counts(normalize=True) * 100).plot.barh(figsize = (8, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f65b215ea90>
# Build the full path of every training image from its id.
df['filename'] = [
    f'{input_path}train_images/{image_id}' for image_id in df['image_id']
]
# Drop the now-redundant id column, shuffle all rows (frac=1) and renumber them.
df = (
    df.drop(columns=['image_id'])
      .sample(frac=1)
      .reset_index(drop=True)
)
df.head()
label filename
0 3 /content/gdrive/MyDrive/kaggle/input/cassava-l...
1 3 /content/gdrive/MyDrive/kaggle/input/cassava-l...
2 3 /content/gdrive/MyDrive/kaggle/input/cassava-l...
3 3 /content/gdrive/MyDrive/kaggle/input/cassava-l...
4 3 /content/gdrive/MyDrive/kaggle/input/cassava-l...
if DEBUG:
    # Keep only the 10% "test" part of a stratified split so debug runs
    # work on a small sample with the same label proportions.
    debug_split = train_test_split(
        df,
        test_size=0.1,
        random_state=SEED,
        shuffle=True,
        stratify=df['label'],
    )
    df = debug_split[1]

In this case, we have 5 labels (4 diseases and healthy):

# Read the JSON mapping from label number (as a string key) to disease name.
label_map_path = f'{input_path}label_num_to_disease_map.json'
with open(label_map_path) as fh:
    id2label = json.load(fh)
id2label
{'0': 'Cassava Bacterial Blight (CBB)',
 '1': 'Cassava Brown Streak Disease (CBSD)',
 '2': 'Cassava Green Mottle (CGM)',
 '3': 'Cassava Mosaic Disease (CMD)',
 '4': 'Healthy'}

From the bar chart shown earlier, the label 3, Cassava Mosaic Disease (CMD) is the most common one. This imbalance may have to be addressed with a weighted loss function or oversampling. I might try this in a future iteration of this kernel or in a new kernel.

Let's check an example image to see what it looks like

from PIL import Image

# Open the first image labelled 3 and report its dimensions.
sample_path = df.loc[df.label == 3, 'filename'].iloc[0]
img = Image.open(sample_path)
width, height = img.size
print(f"The size of the image is W{width} x H{height}")
The size of the image is W800 x H600
img

Loading data

After my quick and rough EDA, let's load the PIL Image to a Numpy array, so we can move on to data augmentation.

In fastai, item_tfms and batch_tfms are defined for the data loader API. The item transforms perform a fairly large crop to 224, and the other standard augmentations (in aug_transforms) are applied at the batch level on the GPU. The batch size is set to 32 here.

Split the dataset into training set and validation set

# Stratified 80/20 split of the dataframe into training and validation rows.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    shuffle=True,
    stratify=df['label'],
)

# (filename, label) pairs for training and validation; filenames only for
# the dataset used to adapt the normalization layer.
train_pairs = (train_df.filename.values, train_df.label.values)
valid_pairs = (valid_df.filename.values, valid_df.label.values)
train_ds = tf.data.Dataset.from_tensor_slices(train_pairs)
valid_ds = tf.data.Dataset.from_tensor_slices(valid_pairs)
adapt_ds = tf.data.Dataset.from_tensor_slices(train_df.filename.values)

# Peek at the first three validation records.
for x, y in valid_ds.take(3):
    print(x, y)
tf.Tensor(b'/content/gdrive/MyDrive/kaggle/input/cassava-leaf-disease-classification/train_images/2484271873.jpg', shape=(), dtype=string) tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(b'/content/gdrive/MyDrive/kaggle/input/cassava-leaf-disease-classification/train_images/3704210007.jpg', shape=(), dtype=string) tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(b'/content/gdrive/MyDrive/kaggle/input/cassava-leaf-disease-classification/train_images/1655615998.jpg', shape=(), dtype=string) tf.Tensor(2, shape=(), dtype=int64)

Data transformation

In this stage we will collate the data and the labels, and then do some basic data transformation so that the image size fits the input size of the model.

Basically item transformations mainly make sure the input data is of the same size so that it can be collated in batches.

Important: You may have noticed that I had not used any kind of normalization or rescaling. I recently discovered that there is Normalization layer included in Keras’ pretrained EfficientNet, as mentioned here.
def decode_image(filename):
    """Read the file at `filename` and decode it as a 3-channel JPEG tensor."""
    raw_bytes = tf.io.read_file(filename)
    return tf.image.decode_jpeg(raw_bytes, channels=3)
  
def collate_train(filename, label):
    """Load one training image and apply the item-level augmentations.

    Applies random brightness (max delta 0.3), a random left/right flip and
    a random crop to IMG_SIZE, in that order. Returns (image, label) with
    the label passed through unchanged.
    """
    pixels = tf.image.random_brightness(decode_image(filename), 0.3)
    pixels = tf.image.random_flip_left_right(pixels, seed=None)
    return tf.image.random_crop(pixels, IMG_SIZE), label

def process_adapt(filename):
    """Load one image and scale its pixel values by 1/255.

    Used for the dataset that adapts the normalization layer; no cropping
    or augmentation is applied.
    """
    to_unit_range = tf.keras.layers.experimental.preprocessing.Rescaling(1.0 / 255)
    return to_unit_range(decode_image(filename))

def collate_valid(filename, label):
    """Load one validation image and crop it to IMG_SIZE.

    NOTE(review): the crop position is random, so validation metrics are
    presumably not exactly repeatable between passes - confirm this is wanted.
    """
    full_image = decode_image(filename)
    return tf.image.random_crop(full_image, IMG_SIZE), label
# Decode (and, for training, augment) each record in parallel.
train_ds = train_ds.map(collate_train, num_parallel_calls=AUTOTUNE)
valid_ds = valid_ds.map(collate_valid, num_parallel_calls=AUTOTUNE)
adapt_ds = adapt_ds.map(process_adapt, num_parallel_calls=AUTOTUNE)

# Training: file-backed cache -> shuffle -> batch -> prefetch.
# NOTE(review): the cache sits after collate_train's random ops, so the crops
# and flips of the first pass are presumably replayed on later epochs; an
# existing 'dump.tfcache' from an earlier run is presumably reused as well -
# confirm both are acceptable.
train_ds_batch = train_ds.cache('dump.tfcache')
train_ds_batch = train_ds_batch.shuffle(buffer_size=1000)
train_ds_batch = train_ds_batch.batch(BATCH_SIZE)
train_ds_batch = train_ds_batch.prefetch(buffer_size=AUTOTUNE)

# Validation: no shuffling, double-sized batches.
valid_ds_batch = valid_ds.batch(BATCH_SIZE * 2)
valid_ds_batch = valid_ds_batch.prefetch(buffer_size=AUTOTUNE)

# Normalization-adapt data: shuffle -> batch -> prefetch.
adapt_ds_batch = adapt_ds.shuffle(buffer_size=1000)
adapt_ds_batch = adapt_ds_batch.batch(BATCH_SIZE)
adapt_ds_batch = adapt_ds_batch.prefetch(buffer_size=AUTOTUNE)
def show_images(ds):
    """Plot up to nine (image, label) pairs from `ds` on a 3x3 grid.

    ds: un-batched dataset yielding (image, label) tensors. Images are cast
        to uint8 for display. Labels may be scalar class ids or one-hot /
        probability vectors.
    """
    _, axs = plt.subplots(3, 3, figsize=(16, 16))
    for (x, y), ax in zip(ds.take(9), axs.flatten()):
        ax.imshow(x.numpy().astype(np.uint8))
        y = np.asarray(y)
        # np.argmax of a scalar is always 0, so a sparse integer label has to
        # be shown as-is; only vector labels need argmax.
        class_id = int(y) if y.ndim == 0 else int(np.argmax(y))
        ax.set_title(class_id)
        ax.axis('off')
# Preview samples from the un-batched training and validation datasets.
show_images(train_ds)
show_images(valid_ds)

Model

Batch augmentation

# Batch-level augmentation built from Keras preprocessing layers; it is also
# used as the first stage of the model.
# RandomCrop takes (height, width, seed): unpacking the 3-tuple IMG_SIZE into
# it passed the channel count as the seed, so only the first two entries are
# passed here.
data_augmentation = tf.keras.Sequential(
    [
     tf.keras.layers.experimental.preprocessing.RandomCrop(IMG_SIZE[0], IMG_SIZE[1]),
     tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
     tf.keras.layers.experimental.preprocessing.RandomRotation(0.25),
     tf.keras.layers.experimental.preprocessing.RandomZoom((-0.2, 0)),
     tf.keras.layers.experimental.preprocessing.RandomContrast((0.2,0.2))
    ]
)
def func(images, labels):
    """Run the batch augmentation on the images; labels pass through unchanged."""
    return (data_augmentation(images), labels)


# Augment a single batch and display its images individually.
one_batch = train_ds.batch(BATCH_SIZE).take(1)
x = one_batch.map(func, num_parallel_calls=AUTOTUNE)
show_images(x.unbatch())

Building a model

I am using an EfficientNetB3, on top of which I add some output layers to predict our 5 disease classes. I decided to load the imagenet pretrained weights locally to keep the internet off (part of the requirements to submit a kernel to this competition).

%%run_if {KAGGLE}
# `subprocess` is not part of the notebook's import cell, so import it here;
# without it the install call below fails with a NameError.
import subprocess

# Install the efficientnet package with the running interpreter's pip.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'efficientnet'])
from efficientnet.tfkeras import EfficientNetB3
%%run_if {GOOGLE}
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications import VGG16
def build_model(base_model, num_class):
    """Assemble the classifier: augmentation -> backbone -> dropout -> softmax.

    base_model: feature extractor applied to the augmented input.
    num_class: number of units in the final softmax layer (named "pred").

    Returns the assembled tf.keras Model taking inputs of shape IMG_SIZE.
    """
    inputs = tf.keras.layers.Input(shape=IMG_SIZE)
    features = base_model(data_augmentation(inputs))
    features = tf.keras.layers.Dropout(0.4)(features)
    head = tf.keras.layers.Dense(num_class, activation="softmax", name="pred")
    return tf.keras.models.Model(inputs=inputs, outputs=head(features))
# Imagenet weights are only requested when training; otherwise the backbone
# starts uninitialised and weights are loaded from a file later.
pretrained_weights = 'imagenet' if TRAIN else None
efficientnet = EfficientNetB3(
    include_top=False,
    weights=pretrained_weights,
    input_shape=IMG_SIZE,
    pooling='avg',
)
# Fine-tune the whole backbone rather than freezing it.
efficientnet.trainable = True

model = build_model(base_model=efficientnet, num_class=len(id2label))
model.summary()
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_4 (InputLayer)         [(None, 300, 300, 3)]     0         
_________________________________________________________________
sequential (Sequential)      (None, 300, 300, 3)       0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
dropout_1 (Dropout)          (None, 1536)              0         
_________________________________________________________________
pred (Dense)                 (None, 5)                 7685      
=================================================================
Total params: 10,791,220
Trainable params: 10,703,917
Non-trainable params: 87,303
_________________________________________________________________

Fine tune

The 3rd layer of the EfficientNet is the Normalization layer, which can be tuned to our new dataset instead of imagenet. Be patient on this one; it does take a bit of time, as we're going through the entire training set.

%%run_if {GOOGLE}
if TRAIN:
    # Adapt the backbone's normalization layer once, then reuse the saved
    # weights file on later runs instead of adapting again.
    normalization_weights = "000_normalization.h5"
    if os.path.exists(normalization_weights):
        model.load_weights(normalization_weights)
    else:
        backbone = model.get_layer('efficientnetb3')
        backbone.get_layer('normalization').adapt(adapt_ds_batch)
        model.save_weights(normalization_weights)

LR finder

class MultiplicativeLearningRate(tf.keras.callbacks.Callback):
    """Callback that multiplies the optimizer's learning rate after each batch.

    After every batch it records the learning rate that was used and the
    reported loss, then multiplies the learning rate by `factor`.

    Attributes:
        factor: multiplier applied to the learning rate after each batch.
        losses: value of logs["loss"] recorded per batch.
        lrs: learning rate in effect for each batch.
    """

    def __init__(self, factor):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.factor = factor
        self.losses = []
        self.lrs = []

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        # Read the current value once and write back a plain number rather
        # than a tensor expression built from the variable.
        current_lr = float(K.get_value(self.model.optimizer.lr))
        self.lrs.append(current_lr)
        self.losses.append(logs.get("loss"))
        K.set_value(self.model.optimizer.lr, current_lr * self.factor)
        

def find_lr(model, ds, batch_size, num_iter, min_lr=1e-6, max_lr=1e1):
    """Sweep the learning rate from `min_lr` to `max_lr` over one epoch and plot the loss.

    model: compiled Keras model whose optimizer learning rate can be set with
        K.set_value (i.e. not a schedule object).
    ds: batched dataset to train on for the sweep.
    batch_size: kept for interface compatibility; `ds` is already batched,
        so it is not forwarded to `fit` (Keras documents that batch_size
        must not be given for dataset inputs).
    num_iter: number of batches in the sweep; sets the per-batch growth factor.
    min_lr, max_lr: bounds of the sweep.

    Raises ValueError if `num_iter` is smaller than 1.

    NOTE: this calls model.fit, so the model's weights are updated by the sweep.
    """
    if num_iter < 1:
        raise ValueError("num_iter must be at least 1")
    # Use the caller's num_iter; the original overwrote it from an unrelated
    # global variable `x`.
    lr_factor = np.exp(np.log(max_lr / min_lr) / num_iter)

    # Train for 1 epoch, starting with minimum learning rate and increase it
    K.set_value(model.optimizer.lr, min_lr)
    lr_callback = MultiplicativeLearningRate(lr_factor)
    model.fit(ds, epochs=1, callbacks=[lr_callback])

    # Plot loss vs log-scaled learning rate (keyword x/y works across
    # seaborn versions).
    plot = sns.lineplot(x=lr_callback.lrs, y=lr_callback.losses)
    plot.set(xscale="log",
             xlabel="Learning Rate (log-scale)",
             ylabel="Training Loss",
             title="Optimal learning rate is slightly below minimum",
             facecolor="#F0F0F0")
# Compile with a plain numeric learning rate for the LR sweep: `schedule` is
# only defined further down the notebook, and find_lr overwrites
# optimizer.lr with K.set_value, which needs a settable value rather than a
# schedule object.
# NOTE: find_lr calls model.fit, so this sweep changes the model's weights.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
              metrics=["accuracy"])
find_lr(model, train_ds_batch, num_iter=len(train_df)//BATCH_SIZE, batch_size=BATCH_SIZE)

Optimizer

CosineDecay

Important: I always wanted to try the new CosineDecayRestarts function implemented in tf.keras as it seemed promising and I struggled to find the right settings (if there were any) for the ReduceLROnPlateau
EPOCHS = 8
# Approximate total number of optimizer steps; used below to plot the schedule.
STEPS = int(round(len(train_df)/BATCH_SIZE)) * EPOCHS

# Cosine decay with warm restarts, starting at 1e-4 with a first cycle of
# 300 steps; all other settings keep their defaults.
schedule = tf.keras.experimental.CosineDecayRestarts(
    initial_learning_rate=1e-4,
    first_decay_steps=300
)
schedule.get_config()
{'alpha': 0.0,
 'first_decay_steps': 300,
 'initial_learning_rate': 0.0001,
 'm_mul': 1.0,
 'name': None,
 't_mul': 2.0}
# Evaluate the schedule at every step and plot learning rate against step.
x = list(range(STEPS))
y = [schedule(step) for step in x]
plt.plot(x, y)
[<matplotlib.lines.Line2D at 0x7f6557800940>]

Warning: There is a gap between what I had expected and the actual LearningRateScheduler that TensorFlow gives us. The LearningRateScheduler updates the lr on_epoch_begin, while it makes more sense to do it on_batch_end or on_batch_begin.

Callbacks

# Keep only the weights of the epoch with the best (lowest) validation loss.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='001_best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)
callbacks = [best_checkpoint]

# Compile for training with Adam driven by the cosine-restart schedule.
model.compile(
    optimizer=tf.keras.optimizers.Adam(schedule),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

Training

if TRAIN:
    # Fit for EPOCHS epochs, validating after each one; the checkpoint
    # callback saves the best weights along the way.
    history = model.fit(train_ds_batch,
                        epochs = EPOCHS,
                        validation_data=valid_ds_batch,
                        callbacks=callbacks)
Epoch 1/8
12/54 [=====>........................] - ETA: 21:15 - loss: 0.4106 - accuracy: 0.8684

Evaluating

def plot_hist(hist):
    """Plot training and validation loss per epoch.

    hist: object returned by model.fit; its `history` dict must contain the
        'loss' and 'val_loss' series.
    """
    # Read from the argument, not from the global `history` variable.
    plt.plot(hist.history['loss'])
    plt.plot(hist.history['val_loss'])
    plt.title('Loss over epochs')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='best')
    plt.show()
if TRAIN:
    # `history` only exists when the training cell has run.
    plot_hist(history)

We load the best weights that were kept from the training phase. Just to check how our model is performing, we will attempt predictions over the validation set. This can help to highlight any classes that are consistently miscategorised.

# Restore the weights saved by the ModelCheckpoint callback (best val_loss).
# NOTE(review): when TRAIN is False this file must already exist from an
# earlier run - confirm.
model.load_weights('001_best_model.h5')

Prediction

# Pick one random training file and decode it at full resolution.
x = train_df.sample(1).filename.values[0]
img = decode_image(x)
%%time
# IMG_SIZE already is the 3-tuple (side, side, 3); appending another 3 gave a
# 4-element size that does not match the rank-3 image.
imgs = [tf.image.random_crop(img, size=IMG_SIZE) for _ in range(4)]

# Show the four random crops side by side.
_, axs = plt.subplots(1, 4, figsize=(16, 4))
for crop, ax in zip(imgs, axs.flatten()):
    ax.imshow(crop.numpy().astype(np.uint8))
    ax.axis('off')
CPU times: user 57.3 ms, sys: 870 µs, total: 58.2 ms
Wall time: 62.1 ms

I apply some very basic test time augmentation to every local image extracted from the original 600-by-800 images. We know we can do some fancy augmentation with albumentations but I wanted to do that exclusively with Keras preprocessing layers to keep the cleanest pipeline possible.

# Test-time augmentation pipeline: random crop to the model's input size,
# random flips, zoom and contrast.
# RandomCrop needs separate height and width; the original `(*IMG_SIZE)` is
# not valid syntax, and unpacking all three entries would pass the channel
# count as the seed.
tta = tf.keras.Sequential(
    [
        tf.keras.layers.experimental.preprocessing.RandomCrop(IMG_SIZE[0], IMG_SIZE[1]),
        tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
        tf.keras.layers.experimental.preprocessing.RandomZoom((-0.2, 0.2)),
        tf.keras.layers.experimental.preprocessing.RandomContrast((0.2,0.2))
    ]
)
def predict_tta(filename, num_tta=4):
    """Predict the class of one image file using test-time augmentation.

    filename: path of the JPEG to classify.
    num_tta: number of augmented copies to average over (default 4).

    Returns the index of the class with the highest summed probability.
    """
    img = decode_image(filename)
    img = tf.expand_dims(img, 0)
    # Request training mode explicitly: Keras random preprocessing layers only
    # randomise in training mode, so this does not depend on the layer's
    # implicit default.
    imgs = tf.concat([tta(img, training=True) for _ in range(num_tta)], 0)
    preds = model.predict(imgs)
    # Sum the per-copy probabilities, then take the most likely class.
    return preds.sum(0).argmax()
# Smoke test: predict the class of one randomly chosen image.
pred = predict_tta(df.sample(1).filename.values[0])
print(pred)
3
if INFERENCE:
    from tqdm import tqdm

    # Predict every validation image, one file at a time, with a progress bar.
    preds = [
        predict_tta(filename, num_tta=4)
        for filename in tqdm(valid_df.filename, total=len(valid_df))
    ]
100%|██████████| 4280/4280 [25:34<00:00,  2.79it/s]
if INFERENCE:
    # Confusion matrix of true validation labels against TTA predictions,
    # drawn as an annotated heatmap with the disease names on both axes.
    class_names = id2label.values()
    cm = tf.math.confusion_matrix(valid_df.label.values, np.array(preds))

    plt.figure(figsize=(10, 8))
    heatmap_style = dict(annot=True, fmt='g', cmap="Blues")
    sns.heatmap(cm,
                xticklabels=class_names,
                yticklabels=class_names,
                **heatmap_style)
    plt.xlabel('Prediction')
    plt.ylabel('Label')
    plt.show()
# input_path already ends with '/', so join instead of concatenating to avoid
# a doubled slash; the trailing '' keeps the final separator.
test_folder = os.path.join(input_path, 'test_images', '')
test_files = os.listdir(test_folder)

# Use a list for `columns` so the column order is fixed (a set has no defined
# order), and fill the labels directly instead of writing a placeholder 0.
submission_df = pd.DataFrame(
    {
        "image_id": test_files,
        "label": [predict_tta(test_folder + name) for name in test_files],
    },
    columns=["image_id", "label"],
)
submission_df
image_id label
0 2216849948.jpg 4
# Write the predictions to submission.csv in the current working directory,
# without the index column.
submission_df.to_csv("submission.csv", index=False)

1% Better Everyday

reference


todos

  • See if I can integrate the Cutmix/Mixup augmentations in the appendix into our existing notebook. This is an excellent example
  • Still want to figure out some intuition of item aug and batch aug. I don't know, maybe there is some limitation or how to do so to help to speed up.
  • Learn more about the adapt function that is being used to retrain the normalization layer of the EfficientNetB3.

done

  • Predict in batch to speed up
  • Add a cell for checkbox parameter to select between kaggle and colab, default is Kaggle.
  • Try out the data_generator and the data_frame_iterator
  • Removing the normalization step in the generator, since in EfficientNet normalization is done within the model itself and the model expects input in the range of [0,255]
  • Find out the intuition and the difference between item_tfm and batch_tfm

    In fastai, item_tfm defines the transforms that are done on the CPU and batch_tfm defines those done on the GPU.

  • Customize my own data generator as fastai creates their Dataloader

    No need, things are much easier than what I was originally expecting. Please refer to the Loading data section in this notebook.

  • The 3rd layer of the EfficientNet is the Normalization layer, which can be tuned to our new dataset instead of imagenet. Be patient on this one; it does take a bit of time, as we're going through the entire training set.

  • Add seed_everything function

Appendix

The albumentation is primarily used for resizing and normalization.

def albu_transforms_train(data_resize):
    """Build the training albumentations pipeline: ToFloat, then a square resize.

    data_resize: target height and width passed to A.Resize.
    """
    steps = [
        A.ToFloat(),
        A.Resize(data_resize, data_resize),
    ]
    return A.Compose(steps, p=1.)

# For Validation 
def albu_transforms_valid(data_resize):
    """Build the validation albumentations pipeline: ToFloat, then a square resize.

    data_resize: target height and width passed to A.Resize.
    """
    steps = [
        A.ToFloat(),
        A.Resize(data_resize, data_resize),
    ]
    return A.Compose(steps, p=1.)
def CutMix(image, label, DIM, PROBABILITY = 1.0):
    """Apply CutMix to a batch: paste a random square patch from another image.

    image: batch of images, indexed as image[j, rows, cols, channels] and
        reshaped to (len(image), DIM, DIM, 3) on return.
    label: per-image label vectors, reshaped to (len(image), 5) on return -
        presumably one-hot; confirm against the caller.
    DIM: side length of the square images.
    PROBABILITY: chance that a given image gets a patch (otherwise WIDTH is 0).

    Returns (images, labels) with labels mixed in proportion to the patch area.
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with cutmix applied
    CLASSES = 5
    
    imgs = []; labs = []
    for j in range(len(image)):
        # DO CUTMIX WITH PROBABILITY DEFINED ABOVE
        # P is 1 when the patch is applied and 0 otherwise
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.int32)
        
        # CHOOSE RANDOM IMAGE TO CUTMIX WITH
        k = tf.cast( tf.random.uniform([],0,len(image)),tf.int32)
        
        # CHOOSE RANDOM LOCATION
        # (x, y) is the centre of the patch
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        
        b = tf.random.uniform([],0,1) # this is beta dist with alpha=1.0
        
        # Patch side length; multiplying by P makes it 0 when CutMix is skipped.
        WIDTH = tf.cast( DIM * tf.math.sqrt(1-b),tf.int32) * P
        # Clip the patch rectangle to the image bounds.
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)

        # MAKE CUTMIX IMAGE
        # Rows ya:yb are rebuilt as [left of image j | patch of image k | right of image j];
        # rows above and below the patch come from image j unchanged.
        one = image[j,ya:yb,0:xa,:]
        two = image[k,ya:yb,xa:xb,:]
        three = image[j,ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        img = tf.concat([image[j,0:ya,:,:],middle,image[j,yb:DIM,:,:]],axis=0)
        imgs.append(img)
        
        # MAKE CUTMIX LABEL
        # NOTE(review): `a` uses the unclipped WIDTH, so when the patch is cut
        # off at a border the weight presumably overstates the pasted area.
        a = tf.cast(WIDTH*WIDTH/DIM/DIM,tf.float32)
        labs.append((1-a)*label[j] + a*label[k])
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(len(image),DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(len(image),CLASSES))
    
    return image2,label2
def MixUp(image, label, DIM, PROBABILITY = 1.0):
    """Apply MixUp to a batch: blend each image and label with a random partner.

    image: batch of images, reshaped to (len(image), DIM, DIM, 3) on return.
    label: per-image label vectors, reshaped to (len(image), 5) on return -
        presumably one-hot; confirm against the caller.
    DIM: side length of the square images.
    PROBABILITY: chance that a given image is blended (otherwise a is 0).

    Returns (images, labels) where both are mixed with the same weight a.
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with mixup applied
    CLASSES = 5
    
    imgs = []; labs = []
    for j in range(len(image)):
        # DO MIXUP WITH PROBABILITY DEFINED ABOVE
        # P is 1.0 when the blend is applied and 0.0 otherwise
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.float32)
                   
        # CHOOSE RANDOM
        # k is the partner image; a is the partner's weight (0 when P is 0)
        k = tf.cast( tf.random.uniform([],0,len(image)),tf.int32)
        a = tf.random.uniform([],0,1)*P # this is beta dist with alpha=1.0
                    
        # MAKE MIXUP IMAGE
        img1 = image[j,]
        img2 = image[k,]
        imgs.append((1-a)*img1 + a*img2)
                    
        # MAKE MIXUP LABEL
        labs.append((1-a)*label[j] + a*label[k])
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(len(image),DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(len(image),CLASSES))
    return image2,label2